*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This do file estimates employment weights for big data establishments

* Load data set
	u "$temp/US-CBSAs/WeightsEstimation.dta", clear
	ssc install nnls, replace 	// Non-negatove least squares executes a regression 
								// where all coefficients are constrained to be non-negative following Lawson and Hanson (1974)
	
* Simple program that runs the NNLS estimation and saves the weights in a column
	capture program drop WEIGHTS
	program define WEIGHTS	// Syntax: 
								* First argument is the dependent variable
								* second argument is a macro containing the independent variables
								* third argument defines the destination for the output to be saved
								* fourth argument is an optional and contains the if condition
		display "...running estimation of weights..."
		preserve
		qui tab cbsafp `4'
			local cbsaN = r(r)
		* Save cbsa
			qui sum cbsafp `4'
				local minCBSA = r(min)
				local maxCBSA = r(max)
			if `minCBSA' == `maxCBSA' {
				local cbsa = r(mean)
			}
			else {
				local cbsa = 0
			}
		* Run regression
		qui nnls `1' `2' `4'
		display "...saving results..."
		* Save weights
		qui matrix b = e(Weights)
		qui svmat double b, names(col)
		capture postclose coef_file
		qui tempfile coefdata
		qui postfile coef_file str32 varname double coef using `coefdata'
		qui local varlist: colnames e(Weights)
		qui foreach var in `varlist'{
			local coef = e(Weights)[1,`var']
			post coef_file ("`var'") (`coef')
		}
		* Save variables and labels
		qui gen variable = ""	
		qui gen varlabel = ""	
		qui local count = 1
		qui foreach name in `2' {
		local varlab: variable label `name'
		replace variable = "`name'" if _n == `count'				
		replace varlabel = "`varlab'" if _n == `count'		
		local count = `count'+1	
		}	
		* Clean data set
		qui keep cbsafp variable  varlabel Weights
		qui order cbsafp variable  varlabel Weights
		qui drop if Weights == .
		* Add city count variables
			local newN = _N +2
			qui set obs `newN'
			qui replace variable = "N" if _n == _N-1
			qui replace varlabel = "Number of CBSAs" if _n == _N	-1		
			qui replace Weights = `cbsaN' if _n == _N-1
			qui replace variable = "Pseudo R2" if _n == _N
			qui replace varlabel = "Share of variance explained" if _n == _N
			* qui replace Weights = `r2' if _n == _N
			qui replace cbsafp = `cbsa'
		* Save data set	
		qui save "`3'", replace
		display "...done..."
		restore
	end	
* Program defined
	
* Define list of establishment types
	local list accounting consultancy insurance investmentbank lawfirm  accounting_global   centralbank_global consultancy_global insurance_global investmentbank_global lawfirm_global  stockexchange_global tsheadquarter_global	

* Define list of CBSAs for the loop
	local USMETROS 12060  12420  14460  16740  16980 17140  17460  18140  19100  19740 19820  26420  26900  28140  29820 30460  31080  33100  33340  33460 34980  35620  36740  37980  38060 38300  38900  39300  40140  40900 41180  41700  41740  41860  41940 42660  45300  47260  47900 

********************************************************************************	
* Estimate weights by employment type and correlate with city size

* Load data set
	u "$temp/US-CBSAs/WeightsEstimation.dta", clear

* Loop over CBSA and compute employment weights by CBSA and establishment type		
	foreach USmid of numlist `USMETROS' {
	display "...`USmid'..."
	WEIGHTS employment "`list'" "$temp/EmpWeights_`USmid'.dta" "if cbsafp == `USmid'"
	}
	
* Append data
	clear
	foreach USmid of numlist `USMETROS' {
	append using "$temp/EmpWeights_`USmid'.dta"
	}
	
* Merge employment data
	merge m:1 cbsafp using "$dataoutput/MSA-GlobalCities/metro_pop2015_Overlap_GlobalCity_USMETRO_DATA"
		drop if _m == 2 // using data contains plenty of CBSAs that are not in the global cities sample
		drop _m	
		gen lpop = ln(metropop_2015) 

* Estimate weights as function of metro population
	* Create placeholders
	gen weight_predicted = .
	gen weight_intercept = .
	gen weight_intercept_se = .
	gen weight_slope = .
	gen weight_slope_se = .
	gen pr2 = .
	
	* Loop Poisson regressions over establishment types
	foreach name in `list' {
		poisson Weights lpop if variable == "`name'"
		replace weight_intercept = _b[_cons] if variable == "`name'"
		replace weight_intercept_se = _se[_cons] if variable == "`name'"
		replace weight_slope = _b[lpop] if variable == "`name'"
		replace weight_slope_se = _se[lpop] if variable == "`name'"
		replace pr2 = e(r2_p) if variable == "`name'"
		predict temp
		replace weight_predicted = temp if variable == "`name'"
		drop temp
	}
	
* Clean and save data
	keep variable varlabel weight_intercept* weight_slope* pr2 
	drop if pr2 == .
	duplicates drop
	label var weight_intercept "Constant"
	label var weight_intercept_se "S.E."
	label var weight_slope "City size elasticity"
	label var weight_slope_se "S.E."
	label var pr2 "Pseudo R2"
	label var varlab "Establishment type"
	capture mkdir "$dataoutput/weights"
	save "$dataoutput/weights/WeightsResults-Continuous.dta", replace
	local list accounting consultancy insurance investmentbank lawfirm  accounting_global   centralbank_global consultancy_global insurance_global investmentbank_global lawfirm_global  stockexchange_global tsheadquarter_global	

* Read results to scalars
	foreach name in `list' {
		sum weight_intercept if variable == "`name'", meanonly
		scalar `name'_int_b = r(mean)
		sum weight_intercept_se if variable == "`name'", meanonly
		scalar `name'_int_se = r(mean)
		sum weight_slope if variable == "`name'", meanonly
		scalar `name'_slope_b = r(mean)
		sum weight_slope_se if variable == "`name'", meanonly
		scalar `name'_slope_se = r(mean)	
	}
	
* Illustrate weights in a figure 
	gen EW_05 = exp(weight_intercept+weight_slope*ln(500000))	
	gen EW_5 = exp(weight_intercept+weight_slope*ln(5000000))	
	graph hbar (mean) EW_05 (mean) EW_5, over(varlabel) bar(1, fcolor(black%10) lcolor(black)) bar(2, fcolor(black) lcolor(black)) graphregion(color(white)) legend(order(1 "Metro population = 0.5M" 2 "Metro population = 5M") cols(2) pos(6)) xsize(10) ysize(5)
* Write Appendix Figure B.3.1	
	capture mkdir "$figures_App/weights"
	graph export "$figures_App/weights/FIG_B3_1_Weights.pdf", replace

* Save Tex table 
	foreach var of varlist weight_intercept* weight_slope* pr2  {
		tostring `var', replace force  format(%9.3f) 
	}
	drop variable
	drop EW_05 EW_5
* Write Table B.3.1
	capture mkdir "$tables_App/weights"
	texsave * using "$tables_App/weights/TAB_B3_1_Weights-Continuous.tex", title("Employment weights by city size") size("footnotesize") ///
	width(15cm) align(lccccc)  varlabels replace  frag  ///
	footnote("In the first step we estimate employment weights by CBSA and establishment type by regressing employment against establishment counts within randomly drawn disks with 750m radius using non-negative least squares method following Lawson and Hanson (1974). In the second step we estimate the city size elasticity by regressing employment weights against ln CBSA population using PPML.")		
	
********************************************************************************
* Repeat using only the first half from the alphabet

* Load data set
	u "$temp/US-CBSAs/WeightsEstimation.dta", clear
	sum cbsa, d
	gen FirstAlphabet = cbsa < r(p50)
	keep if FirstAlphabet == 1

* New list to be processed
	levelsof  cbsafp, local(USMETROIDS)		
	
* Append data
	clear
	foreach USmid of numlist `USMETROIDS' {
	append using "$temp/EmpWeights_`USmid'.dta"
	}
* Merge employment data
	merge m:1 cbsafp using "$dataoutput/MSA-GlobalCities/metro_pop2015_Overlap_GlobalCity_USMETRO_DATA"
		drop if _m == 2 // using data contains plenty of CBSAs that are not in the global cities sample
		drop _m	
		gen lpop = ln(metropop_2015) 

* Estimate weights as function of metro population
	* Create placeholders
	gen weight_predicted = .
	gen weight_intercept = .
	gen weight_intercept_se = .
	gen weight_slope = .
	gen weight_slope_se = .
	gen pr2 = .
	
	* Loop Poisson regressions over establishment types
	foreach name in `list' {
	poisson Weights lpop if variable == "`name'"
	replace weight_intercept = _b[_cons] if variable == "`name'"
	replace weight_intercept_se = _se[_cons] if variable == "`name'"
	replace weight_slope = _b[lpop] if variable == "`name'"
	replace weight_slope_se = _se[lpop] if variable == "`name'"
	replace pr2 = e(r2_p) if variable == "`name'"
	predict temp
	replace weight_predicted = temp if variable == "`name'"
	drop temp
	}
	
* Clean and save data
	keep variable varlabel weight_intercept* weight_slope* pr2 
	drop if pr2 == .
	duplicates drop
	label var weight_intercept "Constant"
	label var weight_intercept_se "S.E."
	label var weight_slope "City size elasticity"
	label var weight_slope_se "S.E."
	label var pr2 "Pseudo R2"
	label var varlab "Establishment type"
	save "$dataoutput/weights/WeightsResults-Continuous-first.dta", replace
	local list accounting consultancy insurance investmentbank lawfirm  accounting_global   centralbank_global consultancy_global insurance_global investmentbank_global lawfirm_global  stockexchange_global tsheadquarter_global	

* Read results to scalars
	foreach name in `list' {
		sum weight_intercept if variable == "`name'", meanonly
		scalar `name'_int_b = r(mean)
		sum weight_intercept_se if variable == "`name'", meanonly
		scalar `name'_int_se = r(mean)
		sum weight_slope if variable == "`name'", meanonly
		scalar `name'_slope_b = r(mean)
		sum weight_slope_se if variable == "`name'", meanonly
		scalar `name'_slope_se = r(mean)	
	}
	
* Illustrate weights in a figure (not reported in paper)
	gen EW_05 = exp(weight_intercept+weight_slope*ln(500000))	
	gen EW_5 = exp(weight_intercept+weight_slope*ln(5000000))	
	graph hbar (mean) EW_05 (mean) EW_5, over(varlabel) bar(1, fcolor(black%10) lcolor(black)) bar(2, fcolor(black) lcolor(black)) graphregion(color(white)) legend(order(1 "Metro population = 0.5M" 2 "Metro population = 5M") cols(2) pos(6)) xsize(10) ysize(5)
	graph export "$figures_App/weights/FIG-B3-Weights-first.pdf", replace


* Save Tex table 
foreach var of varlist weight_intercept* weight_slope* pr2  {
		tostring `var', replace force  format(%9.3f) 
	}
* Save results as table (not reported in paper)
	drop variable
	drop EW_05 EW_5
	texsave * using "$tables_App/weights/TAB_B3-Continuous-first.tex", title("Employment weights by city size") size("footnotesize")   width(15cm) align(lccccc)  varlabels replace  frag  footnote("In the first step we estimate employment weights by CBSA and establishment type by regressing employment against establishment counts within randomly drawn disks with 750m radius using non-negative least squares method following Lawson and Hanson (1974). In the second step we estimate the city size elasticity by regressing employment weights against ln CBSA population using PPML.")			

	
********************************************************************************
* Repeat using only the second half from the alphabet

* Load data set
	u "$temp/US-CBSAs/WeightsEstimation.dta", clear
	sum cbsa, d
	gen FirstAlphabet = cbsa < r(p50)
	keep if FirstAlphabet == 0

* New list to be processed
	levelsof  cbsafp, local(USMETROIDS)		
	
* Append data
	clear
	foreach USmid of numlist `USMETROIDS' {
	append using "$temp/EmpWeights_`USmid'.dta"
	}
	
* Merge employment data
	merge m:1 cbsafp using "$dataoutput/MSA-GlobalCities/metro_pop2015_Overlap_GlobalCity_USMETRO_DATA"
		drop if _m == 2 // using data contains plenty of CBSAs that are not in the global cities sample
		drop _m	
		gen lpop = ln(metropop_2015) 

* Estimate weights as function of metro population
	* Create placeholders
	gen weight_predicted = .
	gen weight_intercept = .
	gen weight_intercept_se = .
	gen weight_slope = .
	gen weight_slope_se = .
	gen pr2 = .
	* Loop Poisson regressions over establishment types
	foreach name in `list' {
	poisson Weights lpop if variable == "`name'"
	replace weight_intercept = _b[_cons] if variable == "`name'"
	replace weight_intercept_se = _se[_cons] if variable == "`name'"
	replace weight_slope = _b[lpop] if variable == "`name'"
	replace weight_slope_se = _se[lpop] if variable == "`name'"
	replace pr2 = e(r2_p) if variable == "`name'"
	predict temp
	replace weight_predicted = temp if variable == "`name'"
	drop temp
	}
	
* Clean and save data
	keep variable varlabel weight_intercept* weight_slope* pr2 
	drop if pr2 == .
	duplicates drop
	label var weight_intercept "Constant"
	label var weight_intercept_se "S.E."
	label var weight_slope "City size elasticity"
	label var weight_slope_se "S.E."
	label var pr2 "Pseudo R2"
	label var varlab "Establishment type"
	save "$dataoutput/weights/WeightsResults-Continuous-last.dta", replace
	local list accounting consultancy insurance investmentbank lawfirm  accounting_global   centralbank_global consultancy_global insurance_global investmentbank_global lawfirm_global  stockexchange_global tsheadquarter_global	

* Read results to scalars
	foreach name in `list' {
		sum weight_intercept if variable == "`name'", meanonly
		scalar `name'_int_b = r(mean)
		sum weight_intercept_se if variable == "`name'", meanonly
		scalar `name'_int_se = r(mean)
		sum weight_slope if variable == "`name'", meanonly
		scalar `name'_slope_b = r(mean)
		sum weight_slope_se if variable == "`name'", meanonly
		scalar `name'_slope_se = r(mean)	
	}
	
* Illustrate weights in a figure 
	gen EW_05 = exp(weight_intercept+weight_slope*ln(500000))	
	gen EW_5 = exp(weight_intercept+weight_slope*ln(5000000))	
	graph hbar (mean) EW_05 (mean) EW_5, over(varlabel) bar(1, fcolor(black%10) lcolor(black)) bar(2, fcolor(black) lcolor(black)) graphregion(color(white)) legend(order(1 "Metro population = 0.5M" 2 "Metro population = 5M") cols(2) pos(6)) xsize(10) ysize(5)
	graph export "$figures_App/weights/FIG-B3-Weights-last.pdf", replace


* Save Tex table 
foreach var of varlist weight_intercept* weight_slope* pr2  {
		tostring `var', replace force  format(%9.3f) 
	}
* Save results as table (not reported in paper)
	drop variable
	drop EW_05 EW_5
	texsave * using "$tables_App/weights/TAB_B3-Continuous-last.tex", title("Employment weights by city size") size("footnotesize")   width(15cm) align(lccccc)  varlabels replace  frag  footnote("In the first step we estimate employment weights by CBSA and establishment type by regressing employment against establishment counts within randomly drawn disks with 750m radius using non-negative least squares method following Lawson and Hanson (1974). In the second step we estimate the city size elasticity by regressing employment weights against ln CBSA population using PPML.")				

* Save CBSA sample variables (not reported in paper)
	u "$temp/US-CBSAs/WeightsEstimation.dta", clear
	sum cbsa, d
	gen FirstAlphabet = cbsa < r(p50)
	keep cbsafp FirstAlphabet
	duplicates drop
	save "$dataoutput/weights/WeightsCBSAsamples.dta", replace

* Script ends	
	